Anumeha Mishra - CS 544 Term Project December 7, 2020
This dataset has been taken from kaggle and the link is as below: https://www.kaggle.com/sakshigoyal7/credit-card-customers
This dataset contains personal information of the customers such as their age, marital status, income,etc.
This dataset contains 10,128 rows and 20 columns.
The columns attributes are :
CLIENTNUM, Attrition flag,Customer_Age , Gender , Dependent_count , Education_Level, Marital_Status, Income_Category, Card_Category, Months_on_book, Total_Relationship_Count, Months_Inactive, Contacts_Count,Credit_Limit,Total_Revolving_Bal, Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
The objective of this project is to gain an insight on customers banking and show visualizations on different trends based on their age, gender and marital status.
You should consider writing a function whenever you’ve copied and pasted a block of code more than twice (i.e. you now have three copies of the same code). For example, take a look at this code. What does it do?
data <- as.data.frame(read.csv("/Users/anumehamishra/Desktop/BankChurners.csv", header=T, sep=",", na.strings = c("","NA")))
head(data)
## CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count
## 1 768805383 Existing Customer 45 M 3
## 2 818770008 Existing Customer 49 F 5
## 3 713982108 Existing Customer 51 M 3
## 4 769911858 Existing Customer 40 F 4
## 5 709106358 Existing Customer 40 M 3
## 6 713061558 Existing Customer 44 M 2
## Education_Level Marital_Status Income_Category Card_Category Months_on_book
## 1 High School Married $60K - $80K Blue 39
## 2 Graduate Single Less than $40K Blue 44
## 3 Graduate Married $80K - $120K Blue 36
## 4 High School Unknown Less than $40K Blue 34
## 5 Uneducated Married $60K - $80K Blue 21
## 6 Graduate Married $40K - $60K Blue 36
## Total_Relationship_Count Months_Inactive Contacts_Count Credit_Limit
## 1 5 1 3 12691
## 2 6 1 2 8256
## 3 4 1 0 3418
## 4 3 4 1 3313
## 5 5 1 0 4716
## 6 3 1 2 4010
## Total_Revolving_Bal Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct
## 1 777 1.335 1144 42
## 2 864 1.541 1291 33
## 3 0 2.594 1887 20
## 4 2517 1.405 1171 20
## 5 0 2.175 816 28
## 6 1247 1.376 1088 24
## Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
## 1 1.625 0.061
## 2 3.714 0.105
## 3 2.333 0.000
## 4 2.333 0.760
## 5 2.500 0.000
## 6 0.846 0.311
table(data$Gender)
##
## F M
## 5358 4769
barplot(table(data$Gender), col = "cyan", ylim = c(0,6000), las = 2, xlab = "Gender",ylab = "Number of customers")
table(data$Marital_Status)
##
## Divorced Married Single Unknown
## 748 4687 3943 749
barplot(table(data$Marital_Status), col = "cyan", ylim = c(0,5000), las = 2, xlab = "Marital_Status",ylab = "Number of customers")
Dependents <- data[data$Dependent_count == '5', ]
table(Dependents$Gender)
##
## F M
## 225 199
barplot(table(Dependents$Gender), col = "red", ylim = c(0,250), density=c(10,20),las = 2, xlab = "Dependents = 5", ylab = "Number of customers")
Education_Level <- table(data$Education_Level)
slice.labels <- names(Education_Level)
slice.percents <- round(Education_Level/sum(Education_Level)*100)
slice.labels <- paste(slice.labels, slice.percents)
slice.labels <- paste(slice.labels, "%", sep = "")
pie(Education_Level, labels = slice.labels, col = hcl(c(0, 60, 120)))
fivenum(data$Customer_Age)
## [1] 26 41 46 52 73
boxplot(data$Customer_Age, horizontal = TRUE, xaxt = "n", xlab = "Age of customers", col=hcl(1))
axis(side = 1, at=fivenum(data$Customer_Age), labels = TRUE)
age_status <- fivenum(data$Customer_Age)
age_status
## [1] 26 41 46 52 73
library(ggplot2)
ggplot(data, aes(x=Customer_Age)) +
geom_histogram(color="black", fill="blue") + facet_grid(~Attrition_Flag)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Dependents <- data[data$Dependent_count == '5',]
head(Dependents)
## CLIENTNUM Attrition_Flag Customer_Age Gender Dependent_count
## 2 818770008 Existing Customer 49 F 5
## 11 708790833 Existing Customer 42 M 5
## 59 711427458 Existing Customer 44 F 5
## 74 820582308 Existing Customer 42 M 5
## 141 789322833 Attrited Customer 48 F 5
## 156 713786508 Existing Customer 42 F 5
## Education_Level Marital_Status Income_Category Card_Category Months_on_book
## 2 Graduate Single Less than $40K Blue 44
## 11 Uneducated Unknown $120K + Blue 31
## 59 Graduate Married Unknown Blue 35
## 74 Uneducated Married $80K - $120K Blue 37
## 141 High School Married Less than $40K Blue 38
## 156 Unknown Married $40K - $60K Blue 36
## Total_Relationship_Count Months_Inactive Contacts_Count Credit_Limit
## 2 6 1 2 8256
## 11 5 3 2 6748
## 59 4 1 2 6273
## 74 6 2 2 22913
## 141 1 3 3 8025
## 156 3 3 3 2038
## Total_Revolving_Bal Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct
## 2 864 1.541 1291 33
## 11 1467 0.831 1201 42
## 59 978 2.275 1359 25
## 74 1528 0.414 1394 35
## 141 0 0.654 673 18
## 156 0 0.786 1238 28
## Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
## 2 3.714 0.105
## 11 0.680 0.217
## 59 1.083 0.156
## 74 0.522 0.067
## 141 0.800 0.000
## 156 0.750 0.000
A <- Dependents$Customer_Age
B <- Dependents$Customer_Age
boxplot(A,B, xaxt = "n", xlab = "variations of age with 5 dependents", ylab = "Age of Customers", col=c("yellow", "green"))
data2=data.frame(Age=data$Customer_Age,Dependent=data$Dependent_count,Transactions=data$Total_Trans_Ct)
head(data2,10)
## Age Dependent Transactions
## 1 45 3 42
## 2 49 5 33
## 3 51 3 20
## 4 40 4 20
## 5 40 3 28
## 6 44 2 24
## 7 51 4 31
## 8 32 0 36
## 9 37 3 24
## 10 48 2 32
plot(data2 , pch=20 , cex=1.0 , col="lightblue")
ggplot(data = data) +
geom_point(mapping = aes(x = Customer_Age, y = Dependent_count, colour = Education_Level ))
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
a <- plot_ly(data, x = data$Customer_Age, type="box", name = 'age')
b <-add_trace(a, x = data$Months_on_book, type="box", name = 'Months')
c <-add_trace(b , x = data$Contacts_Count,type = "box" , name ="Contacts_made" )
c
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
values <- data$Customer_Age
tab <- table(values)
dframe <- as.data.frame(tab)
dframe
## values Freq
## 1 26 78
## 2 27 32
## 3 28 29
## 4 29 56
## 5 30 70
## 6 31 91
## 7 32 106
## 8 33 127
## 9 34 146
## 10 35 184
## 11 36 221
## 12 37 260
## 13 38 303
## 14 39 333
## 15 40 361
## 16 41 379
## 17 42 426
## 18 43 473
## 19 44 500
## 20 45 486
## 21 46 490
## 22 47 479
## 23 48 472
## 24 49 495
## 25 50 452
## 26 51 398
## 27 52 376
## 28 53 387
## 29 54 307
## 30 55 279
## 31 56 262
## 32 57 223
## 33 58 157
## 34 59 157
## 35 60 127
## 36 61 93
## 37 62 93
## 38 63 65
## 39 64 43
## 40 65 101
## 41 66 2
## 42 67 4
## 43 68 2
## 44 70 1
## 45 73 1
x <- as.numeric(as.character(dframe$values))
x
## [1] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## [26] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 70 73
The probability distribution is:
f <- dframe$Freq / (sum(dframe$Freq))
The mean:
mu <- sum(x * f)
mu
## [1] 46.32596
Variance of the distribution is:
sigmaSquare <- sum((x - mu)^2 * f)
sigmaSquare
## [1] 64.26296
sigma <- sqrt(sigmaSquare)
sigma
## [1] 8.016418
Showing probability distribution of age:
plot(x, f, type = 'h', xlab = "Age", ylab = "PMF", ylim = c(0, 0.06), main
= "Spike plot for Age")
abline(h = 0 )
Showing cumulative distribution of age:
cdf <- c(0, cumsum(f))
cdfplot <- stepfun(x, cdf)
plot(cdfplot, verticals=FALSE, pch=16, main="CDF Plot for Age", xlab = "Age", ylab = "C
DF")
values2 <- data$Dependent_count
tab <- table(values2)
dframe <- as.data.frame(tab)
dframe
## values2 Freq
## 1 0 904
## 2 1 1838
## 3 2 2655
## 4 3 2732
## 5 4 1574
## 6 5 424
x <- as.numeric(as.character(dframe$values2))
x
## [1] 0 1 2 3 4 5
The probability distribution is:
f <- dframe$Freq / (sum(dframe$Freq))
The mean:
mu <- sum(x * f)
mu
## [1] 2.346203
The variance of the distribution is:
sigmaSquare <- sum((x - mu)^2 * f)
sigmaSquare
## [1] 1.686996
sigma <- sqrt(sigmaSquare)
sigma
## [1] 1.298844
Showing probability distribution of Income category:
plot(x, f, type = 'h', xlab = "Income_Category", ylab = "PMF", ylim = c(0, 0.5), main
= "Spike plot for Income")
abline(h = 0 )
Showing cumulative distribution of Income category
cdf <- c(0, cumsum(f))
cdfplot <- stepfun(x, cdf)
plot(cdfplot, verticals=FALSE, pch=16, main="CDF Plot for Income", xlab = "Age", ylab = "C
DF")
Age <-data$Customer_Age
ctable <- table(Age)
The mean is :
mu <- mean(Age)
mu
## [1] 46.32596
sigma <- sd(Age)
sigma
## [1] 8.016814
Histogram showing the age of the customers:
dframe <- as.data.frame(ctable)
x <- as.numeric(as.character(data$Customer_Age))
head(x)
## [1] 45 49 51 40 40 44
hist(x, probability = TRUE, xlim = c(0, 80), xlab = "Age", ylab = "Density", main = "Histogram of age")
samples <- 1000
sample_size <- 5
xbar <- numeric(samples)
for(i in 1:samples){
xbar[i] = mean(sample(x, size = sample_size, replace = T))
}
hist(xbar, prob = T, xlab = "Age",
main = "Densities of age with sample size 5", col = "purple")
Mean:
mean1 <- mean(xbar)
mean1
## [1] 46.3198
Standard deviation
sd1 <- sd(xbar)
sd1
## [1] 3.640844
samples <- 1000
sample_size <- 20
xbar <- numeric(samples)
for(i in 1:samples){
xbar[i] = mean(sample(x, size = sample_size, replace = T))
}
hist(xbar, prob = T, xlab = "Age",
main = "Densities of age with sample size 20", col = "red")
Mean:
mean2 <- mean(xbar)
mean2
## [1] 46.2087
Standard deviation
sd2 <- sd(xbar)
sd2
## [1] 1.781045
samples <- 1000
sample_size <- 50
xbar <- numeric(samples)
for(i in 1:samples){
xbar[i] = mean(sample(x, size = sample_size, replace = T))
}
hist(xbar, prob = T, xlab = "Age",
main = "Densities of age with sample size 50", col = "brown")
Mean:
mean3 <- mean(xbar)
mean3
## [1] 46.31612
Standard deviation
sd3 <- sd(xbar)
sd3
## [1] 1.104157
cat("1st distribution:\nMean =",mean1,"\nSD =",sd1)
## 1st distribution:
## Mean = 46.3198
## SD = 3.640844
cat("2nd distribution:\nMean =",mean2,"\nSD =",sd2)
## 2nd distribution:
## Mean = 46.2087
## SD = 1.781045
cat("3rd distribution:\nMean =",mean3,"\nSD =",sd3)
## 3rd distribution:
## Mean = 46.31612
## SD = 1.104157
library(sampling)
table(data$Customer_Age)
##
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## 78 32 29 56 70 91 106 127 146 184 221 260 303 333 361 379 426 473 500 486
## 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
## 490 479 472 495 452 398 376 387 307 279 262 223 157 157 127 93 93 65 43 101
## 66 67 68 70 73
## 2 4 2 1 1
hist(data$Customer_Age,col="yellow")
sample.size <- 200
s <- srswor(sample.size,nrow(data))
sample.1 <- data[s != 0, ]
mean_srswor <- mean(sample.1$Customer_Age)
set.seed(153)
s <- srswr(sample.size, nrow(data))
sample.2 <- data[s != 0, ]
mean_srswr <- mean(sample.2$Customer_Age)
set.seed(123)
N <- nrow(data)
n <- 200
k <- ceiling(N / n)
k
## [1] 51
r <- sample(k, 1)
r
## [1] 31
s <- seq(r, by = k, length = n)
head(s)
## [1] 31 82 133 184 235 286
sample.3 <- data[s, ]
table(sample.3$Customer_Age)
##
## 27 28 29 31 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
## 1 1 2 1 4 4 3 4 7 8 8 2 11 12 8 12 8 7 5 17 8 6 4 9 5 3
## 55 56 57 58 59 60 61 62 64 65
## 4 10 6 3 2 2 5 2 1 3
mean_systematic <- mean(sample.3$Customer_Age)
mean_systematic
## [1] NA
pik <- inclusionprobabilities(data$Customer_Age, sample.size)
s <- UPsystematic(pik)
sample.4 <- data[s != 0, ]
table(sample.4$Customer_Age)
##
## 26 27 28 30 31 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
## 1 1 2 4 2 2 2 3 5 3 3 5 5 7 4 16 10 10 7 10 10 9 8 7 12 8
## 54 55 56 57 58 59 60 62 64 65 67
## 9 4 6 10 3 3 2 2 1 3 1
data["age_range"] = NA
data$age_range <- cut(data$Customer_Age, breaks = c(0, 25, 50, 75, Inf), labels = c('A', 'B', 'C','D'))
data_age <- data.frame(
Customer_Age = data$Customer_Age,
age_range = data$age_range
)
freq <- table(data_age$age_range)
freq
##
## A B C D
## 0 7049 3078 0
set.seed(123)
head(data_age)
## Customer_Age age_range
## 1 45 B
## 2 49 B
## 3 51 C
## 4 40 B
## 5 40 B
## 6 44 B
cl <- cluster(data, c("Customer_Age"), size = 4, method = "srswor")
sample.6 <- getdata(data, cl)
table(sample.6$Customer_Age)
##
## 28 39 40 56
## 29 333 361 262
mean_cluster <- mean(sample.6$Customer_Age)
mean_cluster
## [1] 43.56447
mean_srswor
## [1] 45.82
mean_srswr
## [1] 46.0603
mean_systematic
## [1] NA
sample.size <- 700
s <- srswor(sample.size,nrow(data))
sample.1 <- data[s != 0, ]
mean_srswor <- mean(sample.1$Customer_Age)
set.seed(153)
s <- srswr(sample.size, nrow(data))
sample.2 <- data[s != 0, ]
mean_srswr <- mean(sample.2$Customer_Age)
N <- nrow(data)
n <- 700
k <- ceiling(N / n)
k
## [1] 15
r <- sample(k, 1)
r
## [1] 10
s <- seq(r, by = k, length = n)
head(s)
## [1] 10 25 40 55 70 85
sample.3 <- data[s, ]
table(sample.3$Customer_Age)
##
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## 5 2 3 1 7 3 7 4 15 6 10 20 21 23 21 33 29 21 38 40 36 27 38 32 26 23
## 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
## 21 27 22 12 18 18 18 11 11 7 5 4 2 7 1
mean_systematic <- mean(sample.3$Customer_Age)
pik <- inclusionprobabilities(data$Customer_Age, sample.size)
s <- UPsystematic(pik)
sample.4 <- data[s != 0, ]
table(sample.4$Customer_Age)
##
## 26 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
## 2 1 5 5 3 5 9 8 16 12 19 16 18 17 20 23 34 27 42 26 36 36 42 39 35 25
## 53 54 55 56 57 58 59 60 61 62 63 64 65 67
## 20 23 29 26 14 8 14 10 12 9 2 3 8 1
freq <- table(data_age$age_range)
freq
##
## A B C D
## 0 7049 3078 0
set.seed(123)
head(data_age)
## Customer_Age age_range
## 1 45 B
## 2 49 B
## 3 51 C
## 4 40 B
## 5 40 B
## 6 44 B
mean_cluster
## [1] 43.56447
mean_systematic
## [1] NA
mean_srswr
## [1] 46.47563
mean_srswor
## [1] 46.24